{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The watermark extension is already loaded. To reload it, use:\n",
      "  %reload_ext watermark\n",
      "Anthony Abercrombie 2017-01-28 13:19:02 \n",
      "\n",
      "CPython 3.5.2\n",
      "IPython 5.1.0\n",
      "\n",
      "numpy 1.11.2\n",
      "pandas 0.19.2+0.g825876c.dirty\n",
      "matplotlib 1.5.1\n",
      "Git hash: a95b6218ad551104f5918a31702c6ed9b0d316e3\n"
     ]
    }
   ],
   "source": [
    "# Record the environment at time of execution.\n",
    "# %matplotlib inline enables inline figures without %pylab's implicit\n",
    "# `from pylab import *`, which would flood the namespace with numpy/matplotlib names.\n",
    "%load_ext watermark\n",
    "%matplotlib inline\n",
    "%watermark -a \"Anthony Abercrombie\" -d -t -v -p numpy,pandas,matplotlib -g"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "# __future__ imports must be the first statement, otherwise the cell fails\n",
    "# with a SyntaxError once the notebook is exported to a plain .py script.\n",
    "from __future__ import print_function\n",
    "\n",
    "# Standard library\n",
    "import glob\n",
    "import os\n",
    "import subprocess\n",
    "import sys\n",
    "\n",
    "# Third-party\n",
    "import dotenv\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Relative path from this notebook's directory up to the project root\n",
    "PROJ_ROOT = os.path.join(os.pardir, os.pardir)\n",
    "# Make the local python functions under src/ importable\n",
    "sys.path.append(os.path.join(PROJ_ROOT, 'src'))\n",
    "\n",
    "# Load the AWS keys from the project's .env file as environment variables\n",
    "dotenv_path = os.path.join(PROJ_ROOT, '.env')\n",
    "dotenv.load_dotenv(dotenv_path)\n",
    "\n",
    "# Either value is None when the variable is missing from the environment\n",
    "AWS_ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY')\n",
    "AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')\n",
    "\n",
    "# Load the \"autoreload\" extension\n",
    "%load_ext autoreload\n",
    "# always reload modules marked with \"%aimport\"\n",
    "%autoreload 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download zipfiles from Kaggle"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In order to download data from Kaggle, you normally need to log in to Kaggle and accept the competition's terms and conditions. I've done this and exported my cookies into a file **kaggle_cookies.txt** that you can use to bypass the process. This will allow you to download the data using **wget**. \n",
    "\n",
    "I'm showing the code here in markdown to prevent repeat downloads. \n",
    "\n",
    "```python\n",
    "#sample_submission.csv --- 14.89 kb\n",
    "!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/sample_submission.csv.zip\n",
    "\n",
    "#grid_sizes.csv --- 2.17 kb\n",
    "!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/grid_sizes.csv.zip\n",
    "\n",
    "#sixteen band --- 7.30 kb\n",
    "!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/sixteen_band.zip\n",
    "\n",
    "#three_band --- 12.87 kb\n",
    "!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/three_band.zip\n",
    "\n",
    "#train_geojson_v3 --- 14.22 mb\n",
    "!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/train_geojson_v3.zip\n",
    "\n",
    "#train_wkt_v4.csv --- 11.08 mb\n",
    "!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/train_wkt_v4.csv.zip\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Uploading files to S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Make the AWS credentials available to the S3 upload commands.\n",
    "# Running `export` through subprocess only affects a short-lived child shell, so the\n",
    "# credentials never reach later commands (and the secret ends up on a command line).\n",
    "# Set them on this process instead: child processes started later inherit os.environ.\n",
    "# boto-based tools read AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY -- presumably this\n",
    "# covers the s3put command used for the upload; verify against your installation.\n",
    "aws_env = {\n",
    "    'AWS_ACCESS_KEY_ID': AWS_ACCESS_KEY,\n",
    "    'AWS_SECRET_ACCESS_KEY': AWS_SECRET_ACCESS_KEY,\n",
    "}\n",
    "for env_name, env_value in aws_env.items():\n",
    "    if env_value:\n",
    "        os.environ[env_name] = env_value\n",
    "    else:\n",
    "        # os.environ rejects None; warn instead of exporting the literal string 'None'\n",
    "        print('Warning: {} is not set; check the project .env file'.format(env_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Destination S3 bucket that the files are uploaded into\n",
    "s3_bucket_name = \"dstl-satellite-imagery\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['../../data/raw/grid_sizes.csv.zip', '../../data/raw/sample_submission.csv.zip', '../../data/raw/sixteen_band.zip', '../../data/raw/three_band.zip', '../../data/raw/train_geojson_v3.zip', '../../data/raw/train_wkt_v4.csv.zip']\n"
     ]
    }
   ],
   "source": [
    "# Files we want to upload: everything under <project root>/data/raw.\n",
    "# glob returns matches in arbitrary order, so sort them to keep the listing\n",
    "# (and the upload order) reproducible between runs.\n",
    "raw_data_pattern = os.path.join(PROJ_ROOT, 'data', 'raw', '*')\n",
    "files_to_upload = sorted(glob.glob(raw_data_pattern))\n",
    "print(files_to_upload)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6/6 [00:34<00:00,  4.26s/it]\n"
     ]
    }
   ],
   "source": [
    "# Upload each file to the S3 bucket with the s3put command-line tool (multipart mode).\n",
    "# The arguments are passed as a list without a shell, so paths containing spaces or\n",
    "# shell metacharacters are handled safely, and check_call raises CalledProcessError\n",
    "# on a non-zero exit status instead of silently ignoring a failed upload.\n",
    "for f in tqdm(files_to_upload):\n",
    "    subprocess.check_call(['s3put', '-b', s3_bucket_name, '--multipart', f])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
